In [1]:
import pandas as pd
import numpy as np
import math
import json
import pickle

import plotly.graph_objects as go
from datetime import date
from tqdm import tqdm
In [2]:
# read in the json files
portfolio = pd.read_json('Starbucks_Offers/data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('Starbucks_Offers/data/profile.json', orient='records', lines=True)
transcript = pd.read_json('Starbucks_Offers/data/transcript.json', orient='records', lines=True)
In [3]:
profile.head()
Out[3]:
gender age id became_member_on income
0 None 118 68be06ca386d4c31939f3a4f0e3dd783 20170212 NaN
1 F 55 0610b486422d4921ae7d2bf64640c50b 20170715 112000.0
2 None 118 38fe809add3b4fcf9315a9694bb96ff5 20180712 NaN
3 F 75 78afa995795e4d85b5d9ceeca43f5fef 20170509 100000.0
4 None 118 a03223e636434f42ac4c3df47e8bac43 20170804 NaN
In [4]:
sum(pd.isnull(profile['income']))
Out[4]:
2175
In [5]:
profile = profile.dropna()
In [6]:
profile['became_member_on'] = profile['became_member_on'].apply(lambda x: '/'.join([str(x)[:4], str(x)[4:6], str(x)[6:]]))

profile['became_member_on'] = pd.to_datetime(profile['became_member_on'])

profile.head()
Out[6]:
gender age id became_member_on income
1 F 55 0610b486422d4921ae7d2bf64640c50b 2017-07-15 112000.0
3 F 75 78afa995795e4d85b5d9ceeca43f5fef 2017-05-09 100000.0
5 M 68 e2127556f4f64592b11af22de27a7932 2018-04-26 70000.0
8 M 65 389bc3fa690240e798340f5a15918d5c 2018-02-09 53000.0
12 M 58 2eeac8d8feae4a8cad5a6af0499a211d 2017-11-11 51000.0
In [7]:
transcript['event'].unique()
Out[7]:
array(['offer received', 'offer viewed', 'transaction', 'offer completed'],
      dtype=object)
In [8]:
transcript['value'] = transcript['value'].apply(lambda x: list(x.values())[0])

transcript.head()
Out[8]:
person event value time
0 78afa995795e4d85b5d9ceeca43f5fef offer received 9b98b8c7a33c4b65b9aebfe6a799e6d9 0
1 a03223e636434f42ac4c3df47e8bac43 offer received 0b1e1539f2cc45b7b9fa7c272da2e1d7 0
2 e2127556f4f64592b11af22de27a7932 offer received 2906b810c7d4411798c6938adc9daaa5 0
3 8ec6ce2a7e7949b1bf142def7d0e0586 offer received fafdcd668e3743c1bb461111dcafc2a4 0
4 68617ca6246f4fbc85e91a2a49552598 offer received 4d5c57ea9a6940dd891ad53e9dbe8da0 0
In [9]:
transcript.shape
Out[9]:
(306534, 4)
In [10]:
np.sum(transcript.duplicated())
Out[10]:
397
In [11]:
transcript = transcript.drop_duplicates()
In [12]:
transcript['personvalue'] = transcript['person'].astype(str) + '-' + transcript['value'].astype(str) 

transcript.head()
Out[12]:
person event value time personvalue
0 78afa995795e4d85b5d9ceeca43f5fef offer received 9b98b8c7a33c4b65b9aebfe6a799e6d9 0 78afa995795e4d85b5d9ceeca43f5fef-9b98b8c7a33c4...
1 a03223e636434f42ac4c3df47e8bac43 offer received 0b1e1539f2cc45b7b9fa7c272da2e1d7 0 a03223e636434f42ac4c3df47e8bac43-0b1e1539f2cc4...
2 e2127556f4f64592b11af22de27a7932 offer received 2906b810c7d4411798c6938adc9daaa5 0 e2127556f4f64592b11af22de27a7932-2906b810c7d44...
3 8ec6ce2a7e7949b1bf142def7d0e0586 offer received fafdcd668e3743c1bb461111dcafc2a4 0 8ec6ce2a7e7949b1bf142def7d0e0586-fafdcd668e374...
4 68617ca6246f4fbc85e91a2a49552598 offer received 4d5c57ea9a6940dd891ad53e9dbe8da0 0 68617ca6246f4fbc85e91a2a49552598-4d5c57ea9a694...

What is the type of offer that leads most to transaction?

An offer which leads to a transaction is an offer that has been first viewed and then completed.

It can happen that an offer is being completed without the offer being viewed or before the offer being viewed.

For each person, we will then look at

  • time_viewed: the first time when the person has viewed the author
  • time_completed: the time when this person has completed the offer (the highest time if the offer hase been completed several times)

    I time_completed > time_viewed, then we can consider that this offer has effectively led to a transaction.

  • As the following cell takes time running, we will comment it after having saved the output in a pickle file.

    In [13]:
    # offer_viewed = transcript[transcript['event'] == 'offer viewed']['personvalue']
    
    # offer_viewed_and_completed = transcript[(transcript['personvalue'].isin(offer_viewed))
    #                                        & (transcript['event'] == 'offer completed')]
    
    # offers_leading_to_transaction = []
    # edge_cases = []
    # for offer in tqdm(offer_viewed_and_completed.sort_values('person')['personvalue'].unique()):
        
    #     # focus on the transcript for one offer for a user
    #     offer_timing = transcript[transcript['personvalue'] == offer].sort_values('time')
        
    #     # selecting the first time the offer was viewed
    #     time_viewed = offer_timing[offer_timing['event'] == 'offer viewed']['time'].iloc[0]
        
    #     # selecting the last time the offer was completed
    #     time_completed = offer_timing[offer_timing['event'] == 'offer completed']['time'].iloc[-1]
        
    #     if time_completed >= time_viewed:
    #         offers_leading_to_transaction.append(offer)
    #     else:
    #         edge_cases.append(offer)
    
    In [14]:
    # pickle.dump(offers_leading_to_transaction, open('Starbucks_Offers/offer_leading_to_transaction', 'wb'))
    # pickle.dump(edge_cases, open('Starbucks_Offers/edge_cases', 'wb'))
    
    offers_leading_to_transaction = pickle.load(open('Starbucks_Offers/offer_leading_to_transaction', 'rb'))
    edge_cases = pickle.load(open('Starbucks_Offers/edge_cases', 'rb'))
    

    Out of the offer received, percentage of offers that led effectively led to a transaction:

    In [15]:
    len(offers_leading_to_transaction)/len(transcript[transcript['event'] == 'offer received'])
    
    Out[15]:
    0.28018931001481445

    Out of the offer completed and viewed, percentage of offers viewed after having been completed

    In [16]:
    len(edge_cases)/(len(offers_leading_to_transaction) + len(edge_cases))
    
    Out[16]:
    0.12624693376941945
    In [17]:
    types_leading_to_buy = transcript[
        transcript['personvalue'].isin(offers_leading_to_transaction)
    ]['value'].value_counts(normalize=True).reset_index()
    
    types_leading_to_buy.columns = ['id', 'share_of_transactions']
    
    types_leading_to_buy = portfolio.merge(types_leading_to_buy,
                                           on='id',
                                           how='outer').sort_values(by='share_of_transactions',
                                                                    ascending=False)
    
    In [18]:
    overview_types = types_leading_to_buy.groupby('offer_type')['share_of_transactions'].sum()
    
    fig = go.Figure()
    
    fig.add_trace(go.Pie(labels=overview_types.index, 
                         values=overview_types))
    
    fig.update_layout({'width':500, 'height':500})
    
    fig.show()
    

    Overall, BOGO offers and discounts are responsible for the same number of transactions.

    In [19]:
    types_leading_to_buy
    
    Out[19]:
    reward channels difficulty duration offer_type id share_of_transactions
    6 2 [web, email, mobile, social] 10 10 discount fafdcd668e3743c1bb461111dcafc2a4 0.189977
    5 3 [web, email, mobile, social] 7 7 discount 2298d6c36e964ae4a3e7e9706d1fb8c2 0.181706
    8 5 [web, email, mobile, social] 5 5 bogo f19421c1d4aa40978ebb69ca19b0e20d 0.150550
    1 10 [web, email, mobile, social] 10 5 bogo 4d5c57ea9a6940dd891ad53e9dbe8da0 0.116646
    0 10 [email, mobile, social] 10 7 bogo ae264e3637204a6fb9bb56bc8210ddfd 0.112948
    3 5 [web, email, mobile] 5 7 bogo 9b98b8c7a33c4b65b9aebfe6a799e6d9 0.095546
    9 2 [web, email, mobile] 10 7 discount 2906b810c7d4411798c6938adc9daaa5 0.094494
    4 5 [web, email] 20 10 discount 0b1e1539f2cc45b7b9fa7c272da2e1d7 0.058133
    2 0 [web, email, mobile] 0 4 informational 3f207df678b143eea3cee63160fa8bed NaN
    7 0 [email, mobile, social] 0 3 informational 5a8bc65990b245e5a138643cd4eb9837 NaN

    What we can see here is that the discount offers of low difficulty and shared on all the channels, including social, tend to lead more to transactions than other discount offer that are more difficult to complete and shared through less channels.

    Between them, are the BOGO offer. It seems that a BOGO offer will work better if the difficulty is low and if it is shared on social.

    It is quite interesting to note that the reward does not make an offer more attractive. What seems to impact the most is the amount the person need to spend to get it as well as the place where the offer was shared.

    For the following of the study, we will name each offer for more convenience. We will use the ranking of each offer and their type to name them.

    We will leave the informational as they are because they won't be part of the offers leading to transaction.

    In [20]:
    types_leading_to_buy['offer_name'] = [
        'discount_1', 
        'discount_2', 
        'bogo_1', 
        'bogo_2', 
        'bogo_3',
        'bogo_4', 
        'discount_3', 
        'discount_4', 
        'informational', 
        'informational'
    ]
    

    What is the type of offers generating the more revenue?

    In [21]:
    transaction_offer_completed = transcript[
        (transcript['personvalue'].isin(offers_leading_to_transaction))
        &(transcript['event'] == 'offer completed')][['person', 'time', 'value']
                                                    ]
    
    transaction_offer_completed.head()
    
    Out[21]:
    person time value
    12658 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 0 2906b810c7d4411798c6938adc9daaa5
    12679 629fc02d56414d91bca360decdfa9288 0 9b98b8c7a33c4b65b9aebfe6a799e6d9
    12692 676506bad68e4161b9bbaffeb039626b 0 ae264e3637204a6fb9bb56bc8210ddfd
    12697 8f7dd3b2afe14c078eb4f6e6fe4ba97d 0 4d5c57ea9a6940dd891ad53e9dbe8da0
    12717 227f2d69e46a4899b70d48182822cff6 0 4d5c57ea9a6940dd891ad53e9dbe8da0
    In [22]:
    transactions = transcript[transcript['event'] == 'transaction'][['person', 'time', 'value']]
    
    revenue_per_offer = transaction_offer_completed.merge(transactions, how='left', left_on=['person','time'], right_on = ['person','time'])
    
    revenue_per_offer.head()
    
    Out[22]:
    person time value_x value_y
    0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 0 2906b810c7d4411798c6938adc9daaa5 34.56
    1 629fc02d56414d91bca360decdfa9288 0 9b98b8c7a33c4b65b9aebfe6a799e6d9 33.9
    2 676506bad68e4161b9bbaffeb039626b 0 ae264e3637204a6fb9bb56bc8210ddfd 18.01
    3 8f7dd3b2afe14c078eb4f6e6fe4ba97d 0 4d5c57ea9a6940dd891ad53e9dbe8da0 19.11
    4 227f2d69e46a4899b70d48182822cff6 0 4d5c57ea9a6940dd891ad53e9dbe8da0 28.39
    In [23]:
    revenue_per_type = revenue_per_offer.groupby('value_x')['value_y'].sum().reset_index()
    
    revenue_per_type.columns = ['id', 'revenue']
    revenue_per_type['revenue_share'] = revenue_per_type['revenue'] / revenue_per_type['revenue'].sum()
    
    revenue_per_type = revenue_per_type.drop('revenue', axis=1)
    
    
    types_leading_to_buy.merge(revenue_per_type,
                                on='id',
                                how='outer').sort_values('revenue_share', ascending=False)
    
    Out[23]:
    reward channels difficulty duration offer_type id share_of_transactions offer_name revenue_share
    0 2 [web, email, mobile, social] 10 10 discount fafdcd668e3743c1bb461111dcafc2a4 0.189977 discount_1 0.171945
    1 3 [web, email, mobile, social] 7 7 discount 2298d6c36e964ae4a3e7e9706d1fb8c2 0.181706 discount_2 0.154926
    2 5 [web, email, mobile, social] 5 5 bogo f19421c1d4aa40978ebb69ca19b0e20d 0.150550 bogo_1 0.145593
    3 10 [web, email, mobile, social] 10 5 bogo 4d5c57ea9a6940dd891ad53e9dbe8da0 0.116646 bogo_2 0.135470
    4 10 [email, mobile, social] 10 7 bogo ae264e3637204a6fb9bb56bc8210ddfd 0.112948 bogo_3 0.130988
    6 2 [web, email, mobile] 10 7 discount 2906b810c7d4411798c6938adc9daaa5 0.094494 discount_3 0.095262
    5 5 [web, email, mobile] 5 7 bogo 9b98b8c7a33c4b65b9aebfe6a799e6d9 0.095546 bogo_4 0.087737
    7 5 [web, email] 20 10 discount 0b1e1539f2cc45b7b9fa7c272da2e1d7 0.058133 discount_4 0.078078
    8 0 [web, email, mobile] 0 4 informational 3f207df678b143eea3cee63160fa8bed NaN informational NaN
    9 0 [email, mobile, social] 0 3 informational 5a8bc65990b245e5a138643cd4eb9837 NaN informational NaN

    The revenue led by each offers follows the same ranking as the number of transactions which means that an offer might not lead to spend always more than another one.

    What is the profile of the persons completing the offers

    In [24]:
    focus_transcript = transcript[(transcript['personvalue'].isin(offers_leading_to_transaction))
                                 & (transcript['event'] == 'offer completed')]
    
    sum(focus_transcript[['person', 'value']].duplicated())
    
    Out[24]:
    3866

    There are 3866 times were the same person uses the offer several times. We will leave these duplicated lines as they provide us with the info of a specific offer working on certain types of person (even though we hope at the end winning this consumer as somebody that buys without any offer).

    In [25]:
    offers_and_persons = focus_transcript[['person', 'value']].merge(types_leading_to_buy[['id', 'offer_name']],
                                                                     how='left',
                                                                     left_on='value',
                                                                     right_on='id')
    
    offers_and_persons = offers_and_persons.drop(['id', 'value'], axis=1)
    offers_and_persons
    
    Out[25]:
    person offer_name
    0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f discount_3
    1 629fc02d56414d91bca360decdfa9288 bogo_4
    2 676506bad68e4161b9bbaffeb039626b bogo_3
    3 8f7dd3b2afe14c078eb4f6e6fe4ba97d bogo_2
    4 227f2d69e46a4899b70d48182822cff6 bogo_2
    ... ... ...
    25233 0c027f5f34dd4b9eba0a25785c611273 discount_2
    25234 a6f84f4e976f44508c358cc9aba6d2b3 discount_2
    25235 b895c57e8cd047a8872ce02aa54759d6 discount_1
    25236 8431c16f8e1d440880db371a68f82dd0 discount_1
    25237 24f56b5e1849462093931b164eb803b5 discount_1

    25238 rows × 2 columns

    Now let's use the profiling of users to understand better which user will use which offer.

    In [26]:
    offers_and_persons = offers_and_persons.merge(profile, how='left', left_on='person', right_on='id')
    offers_and_persons
    
    Out[26]:
    person offer_name gender age id became_member_on income
    0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f discount_3 M 42.0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 2016-01-17 96000.0
    1 629fc02d56414d91bca360decdfa9288 bogo_4 M 52.0 629fc02d56414d91bca360decdfa9288 2018-06-05 72000.0
    2 676506bad68e4161b9bbaffeb039626b bogo_3 M 37.0 676506bad68e4161b9bbaffeb039626b 2017-05-15 92000.0
    3 8f7dd3b2afe14c078eb4f6e6fe4ba97d bogo_2 M 48.0 8f7dd3b2afe14c078eb4f6e6fe4ba97d 2015-09-03 62000.0
    4 227f2d69e46a4899b70d48182822cff6 bogo_2 M 51.0 227f2d69e46a4899b70d48182822cff6 2018-05-12 100000.0
    ... ... ... ... ... ... ... ...
    25233 0c027f5f34dd4b9eba0a25785c611273 discount_2 M 56.0 0c027f5f34dd4b9eba0a25785c611273 2017-10-24 61000.0
    25234 a6f84f4e976f44508c358cc9aba6d2b3 discount_2 NaN NaN NaN NaT NaN
    25235 b895c57e8cd047a8872ce02aa54759d6 discount_1 NaN NaN NaN NaT NaN
    25236 8431c16f8e1d440880db371a68f82dd0 discount_1 M 39.0 8431c16f8e1d440880db371a68f82dd0 2018-06-27 39000.0
    25237 24f56b5e1849462093931b164eb803b5 discount_1 F 48.0 24f56b5e1849462093931b164eb803b5 2017-12-28 80000.0

    25238 rows × 7 columns

    1. The gender

    In [27]:
    sum(pd.isnull(offers_and_persons['gender']))
    
    Out[27]:
    974
    In [28]:
    # Removing Nan values
    
    offers_and_persons = offers_and_persons[~pd.isnull(offers_and_persons['gender'])]
    
    In [29]:
    gender_overall = offers_and_persons['gender'].value_counts(normalize=True)
    
    fig = go.Figure()
    
    fig.add_trace(go.Pie(values=gender_overall, 
                         labels=gender_overall.index))
    
    fig.update_layout(width=500, height=500)
    
    fig.show()
    

    In the transactions we are looking at, we have almost the same number of males and females. Let's look at this stat per offer.

    In [30]:
    gender_per_offer = offers_and_persons.groupby(['offer_name', 'gender'])['id'].count().reset_index()
    
    gender_per_offer = gender_per_offer.merge(gender_per_offer.groupby('offer_name')['id'].sum(),
                                              how='left',
                                              on='offer_name')
    
    gender_per_offer['id_x'] = gender_per_offer['id_x']/gender_per_offer['id_y']
    
    gender_per_offer.head()
    
    Out[30]:
    offer_name gender id_x id_y
    0 bogo_1 F 0.463421 3554
    1 bogo_1 M 0.519415 3554
    2 bogo_1 O 0.017164 3554
    3 bogo_2 F 0.534213 2879
    4 bogo_2 M 0.451198 2879
    In [31]:
    fig = go.Figure()
    for gender in ['F', 'M', 'O']:    
                
        fig.add_trace(go.Bar(
            x=gender_per_offer['offer_name'].unique(),
            y=gender_per_offer[(gender_per_offer['gender'] == gender)]['id_x'], name=gender))
    
    
    fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
    fig.show()
    

    The share of women seems to be slightly higher for BOGO. The share of men seems to be slightly higher for discounts.

    2. The age

    Let's look at the age distribution of the persons of interest.

    In [32]:
    fig = go.Figure()
    
    fig.add_trace(go.Histogram(x=offers_and_persons['age'], histnorm='percent', nbinsx=10))
    
    fig.show()
    

    We can see that most of the persons are around their 50s. Let's see if a certain offer is more popular depending on the age.

    In [33]:
    age_per_offer = offers_and_persons.groupby('offer_name')['age'].mean()
    
    age_per_offer
    
    Out[33]:
    offer_name
    bogo_1        55.810917
    bogo_2        56.965266
    bogo_3        55.796725
    bogo_4        55.849554
    discount_1    55.185390
    discount_2    54.878984
    discount_3    56.087500
    discount_4    56.653226
    Name: age, dtype: float64

    It is hard to tell by looking at the average age per offer if there is a link between the age and the offer which leads to a transaction.

    3. Income

    In [34]:
    sum(pd.isnull(offers_and_persons['income']))
    
    Out[34]:
    0
    In [35]:
    fig = go.Figure()
    
    fig.add_trace(go.Histogram(x=offers_and_persons['income'], histnorm='percent'))#, nbinsx=10))
    
    fig.show()
    

    The income distribution does not look like to be following a Normal low (contrary to the age).

    Let's see if we can see a link between the offer and the income of a person.

    In [36]:
    income_per_offer = offers_and_persons.groupby('offer_name')['income'].median()
    
    fig = go.Figure()
    
    fig.add_trace(go.Bar(x=income_per_offer.index, 
                         y=income_per_offer))
    
    fig.show()
    

    Slightly higher income will complete the BOGO offer, which can make sense.

    4. Date of start membership

    Let's transform this variable so that it translate the time lapse between the day when the person has become a member and today.

    In [37]:
    ref_date = offers_and_persons['became_member_on'].max()
    ref_date
    
    Out[37]:
    Timestamp('2018-07-26 00:00:00')
    In [38]:
    offers_and_persons['seniority'] = offers_and_persons['became_member_on'].apply(lambda x:(ref_date - x).days)
    
    offers_and_persons.head()
    
    /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
    
    
    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    
    
    Out[38]:
    person offer_name gender age id became_member_on income seniority
    0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f discount_3 M 42.0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 2016-01-17 96000.0 921
    1 629fc02d56414d91bca360decdfa9288 bogo_4 M 52.0 629fc02d56414d91bca360decdfa9288 2018-06-05 72000.0 51
    2 676506bad68e4161b9bbaffeb039626b bogo_3 M 37.0 676506bad68e4161b9bbaffeb039626b 2017-05-15 92000.0 437
    3 8f7dd3b2afe14c078eb4f6e6fe4ba97d bogo_2 M 48.0 8f7dd3b2afe14c078eb4f6e6fe4ba97d 2015-09-03 62000.0 1057
    4 227f2d69e46a4899b70d48182822cff6 bogo_2 M 51.0 227f2d69e46a4899b70d48182822cff6 2018-05-12 100000.0 75
    In [39]:
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=offers_and_persons['seniority']))
    
    fig.show()
    

    It is clear that the more recent a client is, the more likely he will use an offer.

    What about the person that complete a transaction without any offer?

    In [40]:
    no_offer_and_person = transcript[(~transcript['personvalue'].isin(offers_leading_to_transaction))
                                    & (transcript['event'] == 'transaction')]
    
    no_offer_and_person = no_offer_and_person[['person', 'value']].merge(profile, how='left', left_on='person', right_on='id')
    no_offer_and_person
    
    Out[40]:
    person value gender age id became_member_on income
    0 02c083884c7d45b39cc68e1314fec56c 0.83 F 20.0 02c083884c7d45b39cc68e1314fec56c 2016-07-11 30000.0
    1 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 34.56 M 42.0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 2016-01-17 96000.0
    2 54890f68699049c2a04d415abc25e717 13.23 M 36.0 54890f68699049c2a04d415abc25e717 2017-12-28 56000.0
    3 b2f1cd155b864803ad8334cdf13c4bd2 19.51 F 55.0 b2f1cd155b864803ad8334cdf13c4bd2 2017-10-16 94000.0
    4 fe97aa22dd3e48c8b143116a8403dd52 18.97 F 39.0 fe97aa22dd3e48c8b143116a8403dd52 2017-12-17 67000.0
    ... ... ... ... ... ... ... ...
    138948 b3a1272bc9904337b331bf348c3e8c17 1.59 M 66.0 b3a1272bc9904337b331bf348c3e8c17 2018-01-01 47000.0
    138949 68213b08d99a4ae1b0dcb72aebd9aa35 9.53 M 52.0 68213b08d99a4ae1b0dcb72aebd9aa35 2018-04-08 62000.0
    138950 a00058cf10334a308c68e7631c529907 3.61 F 63.0 a00058cf10334a308c68e7631c529907 2013-09-22 52000.0
    138951 76ddbd6576844afe811f1a3c0fbb5bec 3.53 M 57.0 76ddbd6576844afe811f1a3c0fbb5bec 2016-07-09 40000.0
    138952 c02b10e8752c4d8e9b73f918558531f7 4.05 NaN NaN NaN NaT NaN

    138953 rows × 7 columns

    1. Gender

    In [41]:
    sum(pd.isnull(no_offer_and_person['gender'])) / len(no_offer_and_person)
    
    Out[41]:
    0.1079213834893813
    In [42]:
    no_offer_and_person = no_offer_and_person[~pd.isnull(no_offer_and_person['gender'])]
    
    In [43]:
    fig = go.Figure()
    
    fig.add_trace(go.Pie(values=no_offer_and_person['gender'].value_counts(normalize=True), 
                         labels=no_offer_and_person['gender'].value_counts(normalize=True).index))
    
    fig.show()
    

    There are more males that complete a transaction without seeing any offer.

    2. Age

    In [44]:
    fig = go.Figure()
    
    fig.add_trace(go.Histogram(x=no_offer_and_person['age'], histnorm='percent', nbinsx=10))
    
    fig.show()
    

    The age distribution looks the same as with the offers.

    3. Income

    In [45]:
    fig = go.Figure()
    
    fig.add_trace(go.Histogram(x=no_offer_and_person['income'], histnorm='percent', name='no offer', nbinsx=20))
    
    fig.add_trace(go.Histogram(x=offers_and_persons['income'], histnorm='percent', name='offer', marker=dict(color='gray'), nbinsx=20))
    
    fig.update_layout(barmode='overlay')
    fig.update_traces(opacity=0.75)
    
    fig.show()
    

    People that don't use an offer don't have a higher income when looking at both distributions.

    Still, those two distributions are quite different.

    Date of membership

    In [46]:
    no_offer_and_person['seniority'] = no_offer_and_person['became_member_on'].apply(lambda x:(ref_date - x).days)
    
    /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
    
    
    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    
    
    In [47]:
    fig = go.Figure()
    
    fig.add_trace(go.Histogram(x=no_offer_and_person['seniority'], histnorm='percent', name='no offer', nbinsx=20))
    
    fig.add_trace(go.Histogram(x=offers_and_persons['seniority'], histnorm='percent', name='offer', marker=dict(color='gray'), nbinsx=20))
    
    fig.update_layout(barmode='overlay')
    fig.update_traces(opacity=0.75)
    
    fig.show()
    

    People using an offer or not have the same distribution of date if membership.

    End of data cleaning

    As the no-offer profile is interesting to exploit, we will add an offer call 'no_offer' which will mean that for this type of person, we should not show any offer. We will thus add these profile to our main datafram offers_and_person.

    In [48]:
    no_offer_and_person['offer_name'] = 'no_offer'
    
    no_offer_and_person.head()
    
    /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
    
    
    A value is trying to be set on a copy of a slice from a DataFrame.
    Try using .loc[row_indexer,col_indexer] = value instead
    
    See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
    
    
    Out[48]:
    person value gender age id became_member_on income seniority offer_name
    0 02c083884c7d45b39cc68e1314fec56c 0.83 F 20.0 02c083884c7d45b39cc68e1314fec56c 2016-07-11 30000.0 745 no_offer
    1 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 34.56 M 42.0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 2016-01-17 96000.0 921 no_offer
    2 54890f68699049c2a04d415abc25e717 13.23 M 36.0 54890f68699049c2a04d415abc25e717 2017-12-28 56000.0 210 no_offer
    3 b2f1cd155b864803ad8334cdf13c4bd2 19.51 F 55.0 b2f1cd155b864803ad8334cdf13c4bd2 2017-10-16 94000.0 283 no_offer
    4 fe97aa22dd3e48c8b143116a8403dd52 18.97 F 39.0 fe97aa22dd3e48c8b143116a8403dd52 2017-12-17 67000.0 221 no_offer
    In [49]:
    offers_and_persons.head()
    
    Out[49]:
    person offer_name gender age id became_member_on income seniority
    0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f discount_3 M 42.0 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f 2016-01-17 96000.0 921
    1 629fc02d56414d91bca360decdfa9288 bogo_4 M 52.0 629fc02d56414d91bca360decdfa9288 2018-06-05 72000.0 51
    2 676506bad68e4161b9bbaffeb039626b bogo_3 M 37.0 676506bad68e4161b9bbaffeb039626b 2017-05-15 92000.0 437
    3 8f7dd3b2afe14c078eb4f6e6fe4ba97d bogo_2 M 48.0 8f7dd3b2afe14c078eb4f6e6fe4ba97d 2015-09-03 62000.0 1057
    4 227f2d69e46a4899b70d48182822cff6 bogo_2 M 51.0 227f2d69e46a4899b70d48182822cff6 2018-05-12 100000.0 75
    In [50]:
    cols_to_keep = ['person', 'offer_name', 'gender', 'age', 'seniority']
    
    df = pd.concat([offers_and_persons[cols_to_keep], 
                    no_offer_and_person[cols_to_keep]], 
                    axis=0, 
                    sort=False)
    
    In [51]:
    df['offer_name'].value_counts(normalize=True)
    
    Out[51]:
    no_offer      0.836299
    discount_1    0.030569
    discount_2    0.029213
    bogo_1        0.023978
    bogo_2        0.019424
    bogo_3        0.018951
    bogo_4        0.015875
    discount_3    0.015652
    discount_4    0.010039
    Name: offer_name, dtype: float64

    We notice that when we add the no offer type, we result in an imbalance dataset. We will then exclude them for now and limit ourselves to the offers_and_persons dataset.

    In [52]:
    df = offers_and_persons[['offer_name', 'gender', 'age', 'income', 'seniority']]
    
    df.head()
    
    Out[52]:
    offer_name gender age income seniority
    0 discount_3 M 42.0 96000.0 921
    1 bogo_4 M 52.0 72000.0 51
    2 bogo_3 M 37.0 92000.0 437
    3 bogo_2 M 48.0 62000.0 1057
    4 bogo_2 M 51.0 100000.0 75

    Let's one hot encode the gender column.

    In [53]:
    df = pd.concat([df.drop('gender', axis=1), 
                    pd.get_dummies(df['gender'], prefix='gender', prefix_sep='_', drop_first=True)],
                   axis=1, 
                   sort=False)
    
    df.head()
    
    Out[53]:
    offer_name age income seniority gender_M gender_O
    0 discount_3 42.0 96000.0 921 1 0
    1 bogo_4 52.0 72000.0 51 1 0
    2 bogo_3 37.0 92000.0 437 1 0
    3 bogo_2 48.0 62000.0 1057 1 0
    4 bogo_2 51.0 100000.0 75 1 0
    In [54]:
    df['offer_name'].value_counts(normalize=True)
    
    Out[54]:
    discount_1    0.186738
    discount_2    0.178454
    bogo_1        0.146472
    bogo_2        0.118653
    bogo_3        0.115768
    bogo_4        0.096975
    discount_3    0.095615
    discount_4    0.061325
    Name: offer_name, dtype: float64
    In [55]:
    pd.isnull(df).sum()
    
    Out[55]:
    offer_name    0
    age           0
    income        0
    seniority     0
    gender_M      0
    gender_O      0
    dtype: int64
    In [56]:
    df.to_csv('clean_data.csv')
    
    In [1]:
    from subprocess import call
    call(['python', '-m', 'nbconvert', 'explore_and_clean.ipynb'])
    
    Out[1]:
    1
    In [ ]: